Recap

The summary statistics and respective plots along with eSet creation is in 00_summstats_eset.{Rmd, html}, imputation with random forest is in 01_pml_imp_summary.{Rmd, html}, DimRed analysis is in 02_dimred_hvg.{Rmd, html}. The differential expression analysis using DESeq2 along with significant marker info. is in 03_diffanalysis.{Rmd, html}. This file contains GSVA analysis from signatures coming from relevant datasets along with hypeR GSEA from the signatures collected from differential anal results.

library(ggplot2)
library(stringr)
library(DT)
library(plyr)
library(dplyr)
library(biomaRt)
library(Biobase)
library(reshape2)
library(formattable)
library(VennDiagram)
library(hypeR)
library(xlsx)
library(DESeq2)
library(gridExtra)
library(ggsci)
PATH <- getwd()

Load eSet

eset <- readRDS(file.path(PATH, "data/2021_08_20_eset_imputed_updated.RDS"))
eSet_wo_infl <- eset
table(eSet_wo_infl$Class)
## 
##   Control Dysplasia      HkNR      OSCC 
##        18        22        17         9
eSet_wo_infl$Class <- recode(eSet_wo_infl$Class, "Cancer"="OSCC")
eSet_wo_infl$Class <- factor(eSet_wo_infl$Class, levels = c("Control", "HkNR", "Dysplasia", "OSCC"))

cpm_eset <- eSet_wo_infl
exprs(cpm_eset) <- apply(exprs(cpm_eset), 2, function(x) {x/(sum(x)/1000000)})
print(dim(cpm_eset))
## Features  Samples 
##    21510       66
cpm_eset$Class <- recode(cpm_eset$Class, "Control"="1-Control", "HkNR"="2-HkNR", "Dysplasia"="3-Dysplasia", "OSCC"="4-OSCC")
cpm_eset$Class <- factor(cpm_eset$Class, levels = c("1-Control", "2-HkNR", "3-Dysplasia", "4-OSCC"))

list_signs <- readRDS(file.path(PATH, "results/06_22_pml_signatures_w_sex_smoke_logFC1.5_fdr0.05.RDS"))

Global signatures from OPMD->TCGA_HNSCC->pEMT

OPMD signatures

opmd <- list(up=c("SPRR2B", "DLX2", "SPRR2C", "CERS1", "CKB", "CYP19A1", "CARMIL3", "H2AC14", 
         "TUBA1B"),
         down=c("IER3", "NGEF", "TUBA4A", "ACP6", "SPIDR", "CD46", "GNPTAB", "LCA5", "ZMAT1",
         "SLC9A9", "ZNF204P", "PTCHD1", "FAM46A", "LGR5", "MUC1", "COLCA2", "DM1-AS", "ZNF418", 'NECTIN3', "MLPH", 
         "CCDC129", "TFCP2L1", "ATP6V1B1", "CRACR2B", "ERN2", "UGT1A6", "TLX1", "MUC16"))

gsva_res_opmd <-  as.data.frame(t(GSVA::gsva(exprs(cpm_eset), opmd, verbose=FALSE)))
gsva_res_opmd$diff <- gsva_res_opmd$up-gsva_res_opmd$down
gsva_res_opmd$Class <- cpm_eset$Class[match(rownames(gsva_res_opmd), colnames(cpm_eset))]
opmd <- ggplot2::ggplot(gsva_res_opmd, ggplot2::aes_string(
    x = 'Class', y = 'diff', fill='Class')) +
    ggplot2::geom_boxplot() +
    scale_color_npg(alpha = 0.8)+
    scale_fill_npg(alpha = 0.7)+
    ggplot2::geom_jitter(size=0.3) +
    ggpubr::stat_compare_means(method = "anova",  label = 'p.format')+
    labs(y = "GSVA Score", x = "Type")+
    theme_bw()+
    ggplot2::theme(strip.text.x = ggplot2::element_text(size = 8),
                 axis.text.x = element_text(angle = 45, hjust = 1, margin=ggplot2::margin(5, b=10)),
                 axis.text = element_text(size = 10, family='Helvetica', color="#222222"), 
                 axis.title = element_text(size = 12, family='Helvetica', color="#222222"),
                 legend.text = ggplot2::element_text(family='Helvetica', size=(10),  color="#222222"))

  
opmd

TCGA Sigs

tcga <- readRDS("~/Documents/Research/pml/tcga_sigs.rds")
tcga_res <- GSVA::gsva(expr = exprs(cpm_eset), gset.idx.list = tcga)
## Estimating GSVA scores for 2 gene sets.
## Estimating ECDFs with Gaussian kernels
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |======================================================================| 100%
tcga_res_df <- as.data.frame(t(tcga_res))
tcga_res_df$diff <- tcga_res_df$up - tcga_res_df$down
#gsvaViolinplot(gsvaData = t(tcga_res_df), textsize = 10, eset = cpm_eset, title = 'TCGA')


tcga_res_df$Class <- cpm_eset$Class[match(rownames(tcga_res_df), colnames(cpm_eset))]
up <- ggplot2::ggplot(tcga_res_df, ggplot2::aes_string(
    x = 'Class', y = 'up', fill='Class')) +
    ggplot2::geom_boxplot() +
    ggplot2::geom_jitter(size=0.3) +
    scale_color_npg(alpha = 0.8)+
    scale_fill_npg(alpha = 0.7)+
    ggpubr::stat_compare_means(method = "anova",  label = 'p.format')+
    labs(y = "GSVA Score", x = "Type")+
    theme_bw()+
    ggplot2::theme(strip.text.x = ggplot2::element_text(size = 8),
                 axis.text.x = element_text(angle = 45, hjust = 1, margin=ggplot2::margin(5, b=10)),
                 axis.text = element_text(size = 10, family='Helvetica', color="#222222"), 
                 axis.title = element_text(size = 12, family='Helvetica', color="#222222"),
                 legend.text = ggplot2::element_text(family='Helvetica', size=(10),  color="#222222"))

  
up

down <- ggplot2::ggplot(tcga_res_df, ggplot2::aes_string(
    x = 'Class', y = 'down', fill='Class')) +
    ggplot2::geom_boxplot() +
    ggplot2::geom_jitter(size=0.3) +
    scale_color_npg(alpha = 0.8)+
    scale_fill_npg(alpha = 0.7)+
    ggpubr::stat_compare_means(method = "anova",  label = 'p.format')+
    labs(y = "GSVA Score", x = "Type")+
    theme_bw()+
    ggplot2::theme(strip.text.x = ggplot2::element_text(size = 8),
                 axis.text.x = element_text(angle = 45, hjust = 1, margin=ggplot2::margin(5, b=10)),
                 axis.text = element_text(size = 10, family='Helvetica', color="#222222"), 
                 axis.title = element_text(size = 12, family='Helvetica', color="#222222"),
                 legend.text = ggplot2::element_text(family='Helvetica', size=(10),  color="#222222"))

  
down

tcga <- ggplot2::ggplot(tcga_res_df, ggplot2::aes_string(
    x = 'Class', y = 'diff', fill='Class')) +
    ggplot2::geom_boxplot() +
    ggplot2::geom_jitter(size=0.3) +
    scale_color_npg(alpha = 0.8)+
    scale_fill_npg(alpha = 0.7)+
    ggpubr::stat_compare_means(method = "anova",  label = 'p.format')+
    labs(y = "GSVA Score", x = "Type")+
    theme_bw()+
    ggplot2::theme(strip.text.x = ggplot2::element_text(size = 8),
                 axis.text.x = element_text(angle = 45, hjust = 1, margin=ggplot2::margin(5, b=10)),
                 axis.text = element_text(size = 10, family='Helvetica', color="#222222"), 
                 axis.title = element_text(size = 12, family='Helvetica', color="#222222"),
                 legend.text = ggplot2::element_text(family='Helvetica', size=(10),  color="#222222"))

  
tcga

pEMT Signatures

hnsc_sign <- readRDS(file.path("~/Documents/Research/HNSC_CuratedGenesets.rds"))
hnsc_sign_epi <-  hnsc_sign[names(hnsc_sign) %in% c("Cell_Cycle_PMID29198524", "pEMT_PMID29198524", "Epithelial_Differentiation_1_PMID29198524", "Epithelial_Differentiation_2_PMID29198524", "Stress_PMID29198524", "Hypoxia_PMID29198524")]
names(hnsc_sign_epi)<- sapply(names(hnsc_sign_epi), function(x) str_split(x, pattern = "_PMID")[[1]], USE.NAMES=FALSE)[1,]
names(hnsc_sign_epi) <- recode(names(hnsc_sign_epi), "Epithelial_Differentiation_1"="Epi. Diff. 1", "Epithelial_Differentiation_2"="Epi. Diff. 2")
gsva_res_epi <- as.data.frame(t(GSVA::gsva(exprs(cpm_eset), hnsc_sign_epi, verbose=FALSE)))
gsva_res_epi$Class <- cpm_eset$Class[match(rownames(gsva_res_epi), colnames(cpm_eset))]

pemt <- ggplot2::ggplot(gsva_res_epi, ggplot2::aes_string(
    x = 'Class', y = 'pEMT', fill='Class')) +
    ggplot2::geom_boxplot() +
    scale_color_npg(alpha = 0.8)+
    scale_fill_npg(alpha = 0.7)+
    ggplot2::geom_jitter(size=0.3) +
    labs(y = "GSVA Score", x = "Type")+ 
    ggpubr::stat_compare_means(method = "anova",  label = 'p.format')+
    theme_bw()+
    ggplot2::theme(strip.text.x = ggplot2::element_text(size = 8),
                 axis.text.x = element_text(angle = 45, hjust = 1, margin=ggplot2::margin(5, b=10)),
                 axis.text = element_text(size = 10, family='Helvetica', color="#222222"), 
                 axis.title = element_text(size = 12, family='Helvetica', color="#222222"),
                 legend.text = ggplot2::element_text(family='Helvetica', size=(10),  color="#222222"))


pemt

Bronchial PMLs

J. Beane Bronchial PML

pml_genes <- readRDS("~/Downloads/JBeane_PML/combined_gene_set_symbols.rds")
#names(pml_genes) <- paste("module", seq_len(length(names(pml_genes))), sep = "")
names(pml_genes) <- paste(c("module6", "module1", "module5",  "module8", "module3", "module2", "module9", "module7", "module4"), sep = "")
#selecting modules that show a positive trend
pml_genes_prolif <- pml_genes[c('module5','module9', 'module8')]

prolif_gsva <- GSVA::gsva(exprs(cpm_eset), pml_genes_prolif)
## Estimating GSVA scores for 3 gene sets.
## Estimating ECDFs with Gaussian kernels
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |======================================================================| 100%
prolif_res_df <- as.data.frame(t(prolif_gsva))
prolif_res_df$Class <- cpm_eset$Class
prolif1 <- ggplot2::ggplot(prolif_res_df, ggplot2::aes_string(
    x = 'Class', y = 'module5', fill='Class')) +
    ggplot2::geom_boxplot() +
    scale_color_npg(alpha = 0.8)+
    scale_fill_npg(alpha = 0.7)+
    ggplot2::geom_jitter(size=0.3) +
    labs(y = "GSVA Score", x = "Type", title = paste("Module 5 - Cell     ", "Cycle", sep = "\n"))+ 
    ggpubr::stat_compare_means(method = "anova",  label = 'p.format')+
    theme_bw()+
    ggplot2::theme(strip.text.x = ggplot2::element_text(size = 8),
                 axis.text.x = element_text(angle = 45, hjust = 1, margin=ggplot2::margin(5, b=10)),
                 axis.text = element_text(size = 10, family='Helvetica', color="#222222"), 
                 axis.title = element_text(size = 12, family='Helvetica', color="#222222"),
                 legend.text = ggplot2::element_text(family='Helvetica', size=(10),  color="#222222"),legend.position = "none")


prolif1

prolif2 <- ggplot2::ggplot(prolif_res_df, ggplot2::aes_string(
    x = 'Class', y = 'module9', fill='Class')) +
    ggplot2::geom_boxplot() +
    scale_color_npg(alpha = 0.8)+
    scale_fill_npg(alpha = 0.7)+
    ggplot2::geom_jitter(size=0.3) +
    labs(y = "GSVA Score", x = "Type", title = paste("Module 9 - Interferon",  "Signaling", sep = "\n"))+ 
    ggpubr::stat_compare_means(method = "anova",  label = 'p.format')+
    theme_bw()+
    ggplot2::theme(strip.text.x = ggplot2::element_text(size = 8),
                 axis.text.x = element_text(angle = 45, hjust = 1, margin=ggplot2::margin(5, b=10)),
                 axis.text = element_text(size = 10, family='Helvetica', color="#222222"), 
                 axis.title = element_text(size = 12, family='Helvetica', color="#222222"),
                 legend.text = ggplot2::element_text(family='Helvetica', size=(10),  color="#222222"), legend.position = "none")


prolif2

prolif3 <- ggplot2::ggplot(prolif_res_df, ggplot2::aes_string(
    x = 'Class', y = 'module8', fill='Class')) +
    ggplot2::geom_boxplot() +
    scale_color_npg(alpha = 0.8)+
    scale_fill_npg(alpha = 0.7)+
    ggplot2::geom_jitter(size=0.3) +
    labs(y = "GSVA Score", x = "Type", title = paste("Module 8 - Inflammatory",  "Response", sep = "\n"))+ 
    ggpubr::stat_compare_means(method = "anova",  label = 'p.format')+
    theme_bw()+
    ggplot2::theme(strip.text.x = ggplot2::element_text(size = 8),
                 axis.text.x = element_text(angle = 45, hjust = 1, margin=ggplot2::margin(5, b=10)),
                 axis.text = element_text(size = 10, family='Helvetica', color="#222222"), 
                 axis.title = element_text(size = 12, family='Helvetica', color="#222222"),
                 legend.text = ggplot2::element_text(family='Helvetica', size=(10),  color="#222222"),legend.position = "none")


prolif3

ggarr <- ggpubr::ggarrange(prolif1, prolif2, prolif3, ncol = 3, nrow = 1, widths = c(3,4,4))
ggarr

#ggsave(ggarr, filename = file.path(PATH, "results/06_20_beane_modules.png"), width = 8, height = 4, dpi = 300)

CAF Signatures

prog <- data.frame(t(exprs(cpm_eset)[c("PDGFRB", "COL1A1", "COL1A2", "COL3A1"),]))

gsvaResT <- prog
condition <- 'Class'
cond <- apply(pData(cpm_eset)[, "Class",drop = FALSE],1, paste, collapse = "_")
gsvaResT[, paste(condition, collapse = "_")] <- cond
#gsvaResT$Class <- dplyr::recode_factor(gsvaResT$Class, 'Control'='1-Control',  'HkNR'='2-HkNR', 'Dysplasia'='3-Dysplasia', 'Cancer'='4-Cancer')

gsvaResFlat <- reshape2::melt(gsvaResT, id.vars = paste(condition, collapse = "_"), variable.name = "pathway")

g1 <- ggplot2::ggplot(gsvaResFlat, ggplot2::aes_string(
  x = paste(condition, collapse = "_"), y = "value",
  color = paste(condition, collapse = "_"))) +
  ggplot2::geom_boxplot() +
  scale_color_npg(alpha = 0.8)+
  scale_fill_npg(alpha = 0.7)+
  ggplot2::geom_jitter(size=0.3) +
  ggplot2::facet_wrap(~pathway, scale = "free_y",
                      ncol = nrow(prog)) +
  #ggpubr::stat_compare_means(method = "anova",  label = 'p.format')+
  ggplot2::theme(strip.text.x = ggplot2::element_text(size = 10))+
  ggplot2::theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  labs(title = 'Fibroblast markers', y = "Counts(CPM)", x = "Class")+
  theme(axis.text = element_text(size = 10), axis.title = element_text(size = 10, face = "bold"))
g1

ggsave(g1, filename = file.path(PATH, "results/06_20_fibr_markers.png"), width = 8, height = 4, dpi = 300)
pdgfb <- xlsx::read.xlsx(file = file.path(PATH, "data/pdgfb_kartha_plosone_2016.xlsx"), sheetIndex = 1)
pdgfb_genes <- pdgfb %>% dplyr::filter(Pearson.r >= 0.80)  %>% dplyr::select(Gene_ID)

pdgfb_sig <- list("pdgfb"=vapply(strsplit(pdgfb_genes$Gene_ID, "|", fixed = TRUE), "[", "", 1))
pdgfb_gsva_res1 <- GSVA::gsva(expr = exprs(cpm_eset), gset.idx.list = pdgfb_sig)
## Estimating GSVA scores for 1 gene sets.
## Estimating ECDFs with Gaussian kernels
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
gsvaViolinplot <- function(gsvaData, textsize, eset, title) {
  gsvaResT <- data.frame(t(gsvaData))
  condition <- 'Class'
  cond <- apply(pData(cpm_eset)[, "Class",drop = FALSE],1, paste, collapse = "_")
  gsvaResT[, paste(condition, collapse = "_")] <- cond
  gsvaResFlat <- reshape2::melt(gsvaResT, id.vars = paste(condition,
                                                          collapse = "_"),
                                variable.name = "pathway")
  ggplot2::ggplot(gsvaResFlat, ggplot2::aes_string(
    x = paste(condition, collapse = "_"), y = "value",
    color = paste(condition, collapse = "_"))) +
    ggplot2::geom_boxplot() +
    scale_color_npg(alpha = 0.8)+
  scale_fill_npg(alpha = 0.7)+
  ggplot2::geom_jitter(size=0.3) +
    ggplot2::facet_wrap(~pathway, scale = "free_y",
                        ncol = ceiling(sqrt(nrow(gsvaData)))) +
    ggpubr::stat_compare_means(method = "anova",  label = 'p.format')+
    ggplot2::theme(strip.text.x = ggplot2::element_text(size = textsize))+
    ggplot2::theme(axis.text.x = element_text(angle = 45, hjust = 1))+
    labs(title = title, y = "GSVA Score", x = "Class")+
    theme(axis.text = element_text(size = 10), axis.title = element_text(size = 10, face = "bold"))
}

g2 <- gsvaViolinplot(gsvaData = pdgfb_gsva_res1, textsize = 8,eset = cpm_eset, title = "PDGFRB sigs")
g2

#ggsave(g2, filename = file.path(PATH, "results/06_20_pdgfrb_sigs.png"), width = 4, height = 4, dpi = 300)

pEMT ~ pdgfr sigs

hnsc_sign <- readRDS(file.path(PATH, "data/HNSC_CuratedGenesets.rds"))
hnsc_sign_epi <-  hnsc_sign[names(hnsc_sign) %in% c("Cell_Cycle_PMID29198524", "pEMT_PMID29198524", "Epithelial_Differentiation_1_PMID29198524", "Epithelial_Differentiation_2_PMID29198524", "Stress_PMID29198524", "Hypoxia_PMID29198524")]
names(hnsc_sign_epi)<- sapply(names(hnsc_sign_epi), function(x) str_split(x, pattern = "_PMID")[[1]], USE.NAMES=FALSE)[1,]

hnsc_pemt <- hnsc_sign_epi['pEMT']

gsva_pemt <- GSVA::gsva(expr = exprs(eSet_wo_infl), gset.idx.list = hnsc_pemt)
## Estimating GSVA scores for 1 gene sets.
## Estimating ECDFs with Gaussian kernels
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
gsvaViolinplot(gsvaData = gsva_pemt, textsize = 8, eset = cpm_eset, title = "pEMT sigs")

df_plasticity <- data.frame(t(gsva_pemt), t(pdgfb_gsva_res1), Class=cpm_eset$Class)
df_plasticity$Class <- factor(df_plasticity$Class, levels = c("1-Control", "2-HkNR", "3-Dysplasia", "4-OSCC"))
ggplot(df_plasticity,aes(pEMT, pdgfb)) +
  stat_summary(fun.data=mean_cl_normal) + 
  geom_smooth(method='lm', formula= y~x, ) +
   labs(y = "PDGFRb GSVA scores", x = "pEMT GSVA scores")
## Warning: Removed 66 rows containing missing values (geom_segment).

sp1 <- ggpubr::ggscatter(df_plasticity, x = "pEMT", y = "pdgfb",
   add = "reg.line",  # Add regressin line
   add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
   conf.int = TRUE # Add confidence interval
   )
# Add correlation coefficient
sp1 +  ggpubr::stat_cor(method = "pearson") +
   labs(y = "PDGFRb GSVA scores", x = "pEMT GSVA scores")
## `geom_smooth()` using formula 'y ~ x'

sp2 <- ggpubr::ggscatter(df_plasticity, x = "pEMT", y = "pdgfb", col="Class",
   add = "reg.line",  # Add regressin line
   add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
   conf.int = TRUE # Add confidence interval
   ) +
  scale_color_npg(alpha = 0.8)+
  scale_fill_npg(alpha = 0.7)+
  ggplot2::geom_jitter(size=0.3) 
# Add correlation coefficient
sp2_final <- sp2 +  
  ggpubr::stat_cor(method = "pearson")  +
   labs(y = "PDGFRb GSVA scores", x = "pEMT GSVA scores")

#ggsave(plot = sp2_final, file.path(PATH, "results/06_20_pemt_pdgf.png"), width = 6, height = 4, dpi = 300)

ggplot(df_plasticity,aes(pEMT, pdgfb,  col=Class)) +
  geom_point() +
  scale_color_npg(alpha = 0.8)+
  scale_fill_npg(alpha = 0.7)+
  ggplot2::geom_jitter(size=0.3) +
  ggpubr::stat_cor(method = "pearson")+
  labs(y = "PDGFRb signature", x = "pEMT signature")

ggplot(df_plasticity,aes(pEMT, pdgfb,  col=Class)) +
  stat_summary(fun.data=mean_cl_normal) + 
  geom_smooth(method='lm', formula= y~x) +
  scale_color_npg(alpha = 0.8)+
  scale_fill_npg(alpha = 0.7)+
  ggplot2::geom_jitter(size=0.3) +
  facet_wrap(~Class, ncol = 4) +
  labs(y = "PDGFRb signature", x = "pEMT signature")
## Warning: Removed 18 rows containing missing values (geom_segment).
## Warning: Removed 17 rows containing missing values (geom_segment).
## Warning: Removed 22 rows containing missing values (geom_segment).
## Warning: Removed 9 rows containing missing values (geom_segment).

hypeR

hypeR is run on BIOCARTA, KEGG, REACTOME separately and the .Rmd files for each is separate files. The results are stored in 0.Supporting Material>results>PML_Pathways in the GDrive folder.

Hallmark

with fdr = 0.1

rctbl_mhyp_2 <- function(mhyp,
                       show_emaps=FALSE,
                       show_hmaps=FALSE,
                       hyp_emap_args=list(top=25, val="fdr"),
                       hyp_hmap_args=list(top=25, val="fdr"), 
                       searchable = TRUE, resizable = TRUE) {

    mhyp.df <- data.frame(signature=names(mhyp$data), 
                          size=sapply(mhyp$data, function(hyp) hyp$info[["Signature Size"]]),
                          enriched=sapply(mhyp$data, function(hyp) nrow(hyp$data)),
                          gsets=sapply(mhyp$data, function(hyp) hyp$info[["Genesets"]]),
                          bg=sapply(mhyp$data, function(hyp) hyp$info[["Background"]]))
    
    tbl <- reactable(
        mhyp.df,
        showPageSizeOptions = FALSE,
        onClick = "expand",
        resizable = TRUE,
        rownames = FALSE,
        searchable = TRUE,
        filterable = TRUE,
        defaultColDef = colDef(headerClass="rctbl-outer-header"),
        columns = list(signature = colDef(name="Signature", minWidth=300),
                       size = colDef(name="Signature Size"),
                       enriched = colDef(name="Enriched Genesets"),
                       gsets = colDef(name="Genesets"),
                       bg = colDef(name="Background")),
        
        details = function(index) {
                hyp <- mhyp$data[[index]]
                rctbl_hyp(hyp, type="inner", show_emaps, show_hmaps, hyp_emap_args, hyp_hmap_args)
            },
        wrap = FALSE,
        class = "rctbl-outer-tbl",
        rowStyle = list(cursor="pointer")
        )
    
    htmltools::div(class="rctbl-outer-obj", tbl) 
}
HALLMARK <-  msigdb_gsets("Homo sapiens", "H", "")
names(HALLMARK$genesets) <- names(HALLMARK$genesets) %>% strsplit( "HALLMARK_" ) %>% sapply( tail, 1 )

lmultihyp1 <- hypeR(c(list_signs[[1]],list_signs[[2]], list_signs[[3]]) , HALLMARK, background = nrow(eSet_wo_infl))
## cancer.vs.control_up
## cancer.vs.control_down
## dys.vs.control_up
## dys.vs.control_down
## hknr.vs.control_up
## hknr.vs.control_down
## dys.vs.cancer_up
## dys.vs.cancer_down
## hknr.vs.cancer_up
## hknr.vs.cancer_down
## hknr.vs.dys_up
## hknr.vs.dys_down
rctbl_mhyp(lmultihyp1)
hyp_dots(lmultihyp1, fdr=0.1, merge = TRUE, top = 15, title = "All Pairwise")

#hyp_to_excel(lmultihyp1, file_path = "~/Documents/Research/pml/pml_wo_infl/0906_hyper.xlsx")

#for manuscript
p1 <- hyp_dots(lmultihyp1, fdr=0.1, merge = TRUE, top = 15) +
    labs(y="", x="") + theme_bw() +
    theme_bw() +
    theme(axis.title.y=element_blank(), axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank(), legend.position = "none")
p1

Reactome

Generic pathways

REACTOME <- msigdb_gsets(species="Homo sapiens", category="C2", subcategory="CP:REACTOME", clean = TRUE)
names(REACTOME$genesets) <- names(REACTOME$genesets) %>% strsplit( "REACTOME_" ) %>% sapply( tail, 1 )

lmultihyp2 <- hypeR(c(list_signs[[1]],list_signs[[2]], list_signs[[3]]) , REACTOME, background = rownames(eSet_wo_infl))
## cancer.vs.control_up
## cancer.vs.control_down
## dys.vs.control_up
## dys.vs.control_down
## hknr.vs.control_up
## hknr.vs.control_down
## dys.vs.cancer_up
## dys.vs.cancer_down
## hknr.vs.cancer_up
## hknr.vs.cancer_down
## hknr.vs.dys_up
## hknr.vs.dys_down
hyp_dots(lmultihyp2, fdr=0.1, merge = TRUE,  title = "Pairwise w/ background=rownames(eset)")

rctbl_mhyp(lmultihyp2)
lmultihyp3 <- hypeR(c(list_signs[[1]],list_signs[[2]], list_signs[[3]]) , REACTOME, background = nrow(eSet_wo_infl))
## cancer.vs.control_up
## cancer.vs.control_down
## dys.vs.control_up
## dys.vs.control_down
## hknr.vs.control_up
## hknr.vs.control_down
## dys.vs.cancer_up
## dys.vs.cancer_down
## hknr.vs.cancer_up
## hknr.vs.cancer_down
## hknr.vs.dys_up
## hknr.vs.dys_down
hyp_dots(lmultihyp3, top = 15, fdr=0.1, merge = TRUE,  title = "Pairwise w/ background=nrow(eset)")

rctbl_mhyp(lmultihyp3)
# write.xlsx(lmultihyp3$data$cancer.vs.control_up$as.data.frame(), file = "~/Documents/Research/pml/pml_wo_infl/08_30_host_hyper_reactome.xlsx", sheetName = "cancer.vs.control")
# 
# for (i in 1: length(names(lmultihyp3$data))) {
#   gc()
#   write.xlsx(lmultihyp3$[names(lmultihyp3$data[i])], file = "~/Documents/Research/pml/pml_wo_infl/08_30_host_hyper_reactome.xlsx", sheetName = names(lmultihyp3$data[i]), append = T)
# }

Hierarchical

Creating heirarchical genesets

genesets <- REACTOME$genesets
length(genesets)
## [1] 1604

Clustering

suppressPackageStartupMessages(library(hierarchicalSets))
suppressPackageStartupMessages(library(qdapTools))

mat <- genesets %>%
    qdapTools::mtabulate() %>%
    as.matrix() %>%
    t() %>%
    hierarchicalSets::format_sets()
## Warning in format_sets.matrix(.): Values above 1 set to 1
hierarchy <- hierarchicalSets::create_hierarchy(mat, intersectLimit=1)
print(hierarchy)
## A HierarchicalSet object
## 
##                  Universe size: 10968
##                 Number of sets: 1604
## Number of independent clusters: 272
plot(hierarchy, type='intersectStack', showHierarchy=TRUE, label=FALSE)

plot(hierarchy, type='outlyingElements', quantiles=0.75, alpha=0.5, label=FALSE)

suppressPackageStartupMessages(library(dendextend))
suppressPackageStartupMessages(library(stringi))
find.trees <- function(d) {
    subtrees <- dendextend::partition_leaves(d)
    leaves <- subtrees[[1]]
    find.paths <- function(leaf) {
        which(sapply(subtrees, function(x) leaf %in% x))
    }
    paths <- lapply(leaves, find.paths)
    edges <- data.frame(from=c(), to=c())
    if (length(d) > 1) {
        for (path in paths) {
            for (i in seq(1, length(path)-1)) {
                edges <- rbind(edges, data.frame(from=path[i], to=path[i+1]))
            }
        }
        edges <- dplyr::distinct(edges)
        edges$from <- paste0("N", edges$from)
        edges$to <- paste0("N", edges$to)
    }
    names(subtrees) <- paste0("N", seq(1:length(subtrees)))
    nodes <- data.frame(id=names(subtrees))
    rownames(nodes) <- nodes$id
    nodes$label <- ""
    leaves <- sapply(subtrees, function(x) length(x) == 1)
    nodes$label[leaves] <- sapply(subtrees[leaves], function(x) x[[1]])
    nodes$id <- NULL
    
    # Internal nodes will not have labels, so we can generate unique hash identifiers
    ids <- stringi::stri_rand_strings(nrow(nodes), 32)
    names(ids) <- rownames(nodes)
    rownames(nodes) <- ids[rownames(nodes)]
    edges$from <- ids[edges$from]
    edges$to <- ids[edges$to]
    
    return(list("edges"=edges, "nodes"=nodes))
}
trees <- lapply(hierarchy$clusters, find.trees)
length(trees)
## [1] 272

Nodes

nodes.all <- lapply(trees, function(x) x$nodes)
nodes <- do.call(rbind, nodes.all)
head(nodes)
##                                                         label
## 6SahW6t1EjGDYvbkaKXAAGmbsCdAUIxw                             
## dC9MsJ3GjmP2eZY89BOeBpA8k5QcUj9d     Mitochondrial Uncoupling
## yCkQDgF2o16F91Ce6Qnl1eClxbdpTsz8 The Fatty Acid Cycling Model
## 7Ix7Zm741GJd4euDmBlmtdmx2J4Zkexb                             
## PBDprVcM2IOyAyNcN2ZWzn909yvrkN0a Metallothioneins Bind Metals
## OI1C0HbU1QXP8nsCjvmxPJa9JCk2M70e       Response To Metal Ions

Edges

edges.all <- lapply(trees, function(x) x$edges)
edges <- data.frame(do.call(rbind, edges.all))
head(edges)
##                               from                               to
## 1 6SahW6t1EjGDYvbkaKXAAGmbsCdAUIxw dC9MsJ3GjmP2eZY89BOeBpA8k5QcUj9d
## 2 6SahW6t1EjGDYvbkaKXAAGmbsCdAUIxw yCkQDgF2o16F91Ce6Qnl1eClxbdpTsz8
## 3 7Ix7Zm741GJd4euDmBlmtdmx2J4Zkexb PBDprVcM2IOyAyNcN2ZWzn909yvrkN0a
## 4 7Ix7Zm741GJd4euDmBlmtdmx2J4Zkexb OI1C0HbU1QXP8nsCjvmxPJa9JCk2M70e
## 5 IXs3CeDFti8f9BfYIi5cheePqtIDBHu2 4EngvP3aQLtcz3YJlVk5SEtKKZ2Ukq4P
## 6 IXs3CeDFti8f9BfYIi5cheePqtIDBHu2 Oyut2jol4tUQ3Ag8eo2MVtir655DSbqx

Create the relational genesets object

#genesets <- hyperdb_rgsets("REACTOME", version="70.0")
rgsets_obj <- rgsets$new(genesets, nodes, edges, name="REACTOME", version=msigdb_version())
rgsets_obj
## REACTOME v7.4.1 
## 
## Genesets
## 
## 2 Ltr Circle Formation (7)
## A Tetrasaccharide Linker Sequence Is Required For Gag Synthesis (26)
## Abacavir Metabolism (5)
## Abacavir Transmembrane Transport (5)
## Abacavir Transport And Metabolism (10)
## Abc Family Proteins Mediated Transport (136)
## 
## Nodes
## 
##                                                         label
## 6SahW6t1EjGDYvbkaKXAAGmbsCdAUIxw                             
## dC9MsJ3GjmP2eZY89BOeBpA8k5QcUj9d     Mitochondrial Uncoupling
## yCkQDgF2o16F91Ce6Qnl1eClxbdpTsz8 The Fatty Acid Cycling Model
## 7Ix7Zm741GJd4euDmBlmtdmx2J4Zkexb                             
## PBDprVcM2IOyAyNcN2ZWzn909yvrkN0a Metallothioneins Bind Metals
## OI1C0HbU1QXP8nsCjvmxPJa9JCk2M70e       Response To Metal Ions
##                                                                id length
## 6SahW6t1EjGDYvbkaKXAAGmbsCdAUIxw 6SahW6t1EjGDYvbkaKXAAGmbsCdAUIxw      6
## dC9MsJ3GjmP2eZY89BOeBpA8k5QcUj9d dC9MsJ3GjmP2eZY89BOeBpA8k5QcUj9d      6
## yCkQDgF2o16F91Ce6Qnl1eClxbdpTsz8 yCkQDgF2o16F91Ce6Qnl1eClxbdpTsz8      5
## 7Ix7Zm741GJd4euDmBlmtdmx2J4Zkexb 7Ix7Zm741GJd4euDmBlmtdmx2J4Zkexb     14
## PBDprVcM2IOyAyNcN2ZWzn909yvrkN0a PBDprVcM2IOyAyNcN2ZWzn909yvrkN0a     11
## OI1C0HbU1QXP8nsCjvmxPJa9JCk2M70e OI1C0HbU1QXP8nsCjvmxPJa9JCk2M70e     14
## 
## Edges
## 
##                               from                               to
## 1 6SahW6t1EjGDYvbkaKXAAGmbsCdAUIxw dC9MsJ3GjmP2eZY89BOeBpA8k5QcUj9d
## 2 6SahW6t1EjGDYvbkaKXAAGmbsCdAUIxw yCkQDgF2o16F91Ce6Qnl1eClxbdpTsz8
## 3 7Ix7Zm741GJd4euDmBlmtdmx2J4Zkexb PBDprVcM2IOyAyNcN2ZWzn909yvrkN0a
## 4 7Ix7Zm741GJd4euDmBlmtdmx2J4Zkexb OI1C0HbU1QXP8nsCjvmxPJa9JCk2M70e
## 5 IXs3CeDFti8f9BfYIi5cheePqtIDBHu2 4EngvP3aQLtcz3YJlVk5SEtKKZ2Ukq4P
## 6 IXs3CeDFti8f9BfYIi5cheePqtIDBHu2 Oyut2jol4tUQ3Ag8eo2MVtir655DSbqx
hyp1 <- hypeR(c(list_signs[[1]], list_signs[[2]], list_signs[[3]]), rgsets_obj, background = rownames(exprs(eSet_wo_infl)))
## cancer.vs.control_up
## cancer.vs.control_down
## dys.vs.control_up
## dys.vs.control_down
## hknr.vs.control_up
## hknr.vs.control_down
## dys.vs.cancer_up
## dys.vs.cancer_down
## hknr.vs.cancer_up
## hknr.vs.cancer_down
## hknr.vs.dys_up
## hknr.vs.dys_down
hyp_dots(hyp1, fdr=0.1, top=20, merge = TRUE, title = "Pairwise w/ background=rownames(eset)", sizes = TRUE)

rctbl_mhyp(hyp1)
#this was used in the manuscript!!!
hyp2 <- hypeR(c(list_signs[[1]], list_signs[[2]], list_signs[[3]]), rgsets_obj, background = nrow(exprs(eSet_wo_infl)))
## cancer.vs.control_up
## cancer.vs.control_down
## dys.vs.control_up
## dys.vs.control_down
## hknr.vs.control_up
## hknr.vs.control_down
## dys.vs.cancer_up
## dys.vs.cancer_down
## hknr.vs.cancer_up
## hknr.vs.cancer_down
## hknr.vs.dys_up
## hknr.vs.dys_down
#hyp_to_excel(hyp2, file_path = "~/Documents/Research/pml/pml_wo_infl/0906_reactome.xlsx")
rctbl_mhyp(hyp2)
hyp_dots(hyp2, fdr=0.1, top=15, merge = TRUE, title = "Pairwise w/ background=nrow(eset)", sizes = TRUE)

hyp_hmap(hyp2, fdr=0.1, top=15)
## $cancer.vs.control_up
## 
## $cancer.vs.control_down
## 
## $dys.vs.control_up
## 
## $dys.vs.control_down
## 
## $hknr.vs.control_up
## NULL
## 
## $hknr.vs.control_down
## NULL
## 
## $dys.vs.cancer_up
## 
## $dys.vs.cancer_down
## NULL
## 
## $hknr.vs.cancer_up
## 
## $hknr.vs.cancer_down
## 
## $hknr.vs.dys_up
## 
## $hknr.vs.dys_down
## NULL
#do hierarchical clust

.dots_multi_plot <- function(multihyp_data,
                             top=20,
                             abrv=50,
                             sizes=TRUE,
                             pval_cutoff=1, 
                             fdr_cutoff=1,
                             val=c("fdr", "pval"),
                             title="") {
    
    # Default arguments
    val <- match.arg(val)
    
    # Count significant genesets across signatures
    multihyp_dfs <- lapply(multihyp_data, function(hyp_obj) {
        hyp_obj$data %>%
        dplyr::filter(pval <= pval_cutoff) %>%
        dplyr::filter(fdr <= fdr_cutoff) %>%
        dplyr::select(label)
    })
    
    # Take top genesets
    labels <- names(sort(table(unlist(multihyp_dfs)), decreasing=TRUE))
    if (!is.null(top)) labels <- head(labels, top)
    
    # Handle empty dataframes
    if (length(labels) == 0) return(ggempty())
    
    # Create a multihyp dataframe
    dfs <- lapply(multihyp_data, function(hyp_obj) {
        hyp_df <- hyp_obj$data
        hyp_df[hyp_df$label %in% labels, c("label", val), drop=FALSE]
    })
    
    df <- suppressWarnings(Reduce(function(x, y) merge(x, y, by="label", all=TRUE), dfs))
    colnames(df) <- c("label", names(dfs))
    rownames(df) <- df$label
    df <- df[rev(labels), names(dfs)]
    
    # Abbreviate labels
    label.abrv <- substr(rownames(df), 1, abrv)
    if (any(duplicated(label.abrv))) {
        stop("Non-unique labels after abbreviating")
    } else {
        rownames(df) <- factor(label.abrv, levels=label.abrv)   
    }
    
    if (val == "pval") {
        cutoff <- pval_cutoff
        color.label <- "P-Value"
    }
    if (val == "fdr") {
        cutoff <- fdr_cutoff
        color.label <- "FDR"
    }
    
    df.melted <- reshape2::melt(as.matrix(df))
    colnames(df.melted) <- c("label", "signature", "significance")
    df.melted$size <- if(sizes) df.melted$significance else 1
    return(df.melted)
}

arrange the dotplot according to clustered groups

.reverselog_trans <- function(base=exp(1)) {
    trans <- function(x) -log(x, base)
    inv <- function(x) base^(-x)
    scales::trans_new(paste0("reverselog-", format(base)), trans, inv, 
              scales::log_breaks(base=base), 
              domain=c(1e-100, Inf))
}

df <- .dots_multi_plot(hyp2$data, top = 15, abrv = 75, sizes = TRUE, fdr_cutoff = 0.1, pval_cutoff = 0.05, val = "fdr", title)
df <- df[df$significance <= 0.1, ]
#rownames(df) <- NULL
df %>%
    ggplot(aes(x=signature, y=(factor(label)), color=significance, size=size)) +
    geom_point() +
    scale_color_continuous(low="#114357", high="#E53935", trans=.reverselog_trans(10)) +
    scale_size_continuous(trans=.reverselog_trans(10), guide="none") +
    theme_bw() + 
    ggplot2::theme(axis.text.x = element_text(angle = 45, hjust = 1))

label_ordered <- c("Extracellular Matrix Organization", "Activation Of Matrix Metalloproteinases", "Degradation Of The Extracellular Matrix", "Cytokine Signaling In Immune System", "Interleukin 4 And Interleukin 13 Signaling", "Anti Inflammatory Response Favouring Leishmania Parasite Infection", "Fcgr Activation",  "Fcgr3a Mediated Il10 Synthesis",  "Immunoregulatory Interactions Between A Lymphoid And A Non Lymphoid Cell", "Formation Of The Cornified Envelope", "Collagen Formation", "Collagen Degradation", "Biological Oxidations", "Fatty Acids", "Cytochrome P450 Arranged By Substrate Type")
df <- df[order(match(df$label, label_ordered)),]

df$label <- factor(df$label, levels = rev(label_ordered))

df %>%
    ggplot(aes(x=signature, y=(factor(label)), color=significance, size=size)) +
    geom_point() +
    scale_color_continuous(low="#114357", high="#E53935", trans=.reverselog_trans(10)) +
    scale_size_continuous(trans=.reverselog_trans(10), guide="none") +  theme_bw() 

# +theme(axis.title.y=element_blank(), axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank(), legend.position = "none")

#for manuscript
p1 <- df %>%
    ggplot(aes(x=signature, y=(factor(label)), color=significance, size=size)) +
    geom_point() +
    scale_color_continuous(low="#114357", high="#E53935", trans=.reverselog_trans(10)) +
    scale_size_continuous(trans=.reverselog_trans(10), guide="none") +  theme_bw() +
    theme(axis.title.y=element_blank(), axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank(), legend.position = "none")

p1

KEGG

with fdr = 0.1

KEGG <- msigdb_gsets(species="Homo sapiens", category = 'C2', subcategory = 'CP:KEGG')
#enrichr_gsets("KEGG_2019_Human")
lmultihyp1 <- hypeR(list_signs[[1]], KEGG)
## cancer.vs.control_up
## cancer.vs.control_down
## dys.vs.control_up
## dys.vs.control_down
## hknr.vs.control_up
## hknr.vs.control_down
hyp_dots(lmultihyp1, fdr=0.1, merge = TRUE, title = "PML with Controls")

rctbl_mhyp(lmultihyp1)
lmultihyp2 <- hypeR(list_signs[[2]], KEGG)
## dys.vs.cancer_up
## dys.vs.cancer_down
## hknr.vs.cancer_up
## hknr.vs.cancer_down
hyp_dots(lmultihyp2, fdr=0.1, merge = TRUE, title = "PML with Cancer")

rctbl_mhyp(lmultihyp2)
lmultihyp3 <- hypeR(list_signs[[3]], KEGG)
## hknr.vs.dys_up
## hknr.vs.dys_down
hyp_dots(lmultihyp3, fdr=0.1, merge = TRUE, title = "Pairwise PML")

rctbl_mhyp(lmultihyp3)

GO

with fdr = 0.1

GO <- msigdb_gsets("Homo sapiens", "C5", subcategory = 'CC')
names(GO$genesets) <- names(GO$genesets) %>% strsplit( "GOMF_" ) %>% sapply( tail, 1 )
length(unique(names(GO$genesets)))
## [1] 996
lmultihyp1 <- hypeR(list_signs[[1]], GO)
## cancer.vs.control_up
## cancer.vs.control_down
## dys.vs.control_up
## dys.vs.control_down
## hknr.vs.control_up
## hknr.vs.control_down
hyp_dots(lmultihyp1, fdr=0.1, merge = TRUE, title = "PML with Controls")

rctbl_mhyp(lmultihyp1)
lmultihyp2 <- hypeR(list_signs[[2]], GO)
## dys.vs.cancer_up
## dys.vs.cancer_down
## hknr.vs.cancer_up
## hknr.vs.cancer_down
hyp_dots(lmultihyp2, fdr=0.1, merge = TRUE, title = "PML with Cancer")

rctbl_mhyp(lmultihyp2)
lmultihyp3 <- hypeR(list_signs[[3]], GO)
## hknr.vs.dys_up
## hknr.vs.dys_down
hyp_dots(lmultihyp3, fdr=0.1, merge = TRUE, title = "Pairwise PML")

rctbl_mhyp(lmultihyp3)
lmultihyp1 <- hypeR(c(list_signs[[1]],list_signs[[2]], list_signs[[3]]) , GO, fdr = 0.1)
## cancer.vs.control_up
## cancer.vs.control_down
## dys.vs.control_up
## dys.vs.control_down
## hknr.vs.control_up
## hknr.vs.control_down
## dys.vs.cancer_up
## dys.vs.cancer_down
## hknr.vs.cancer_up
## hknr.vs.cancer_down
## hknr.vs.dys_up
## hknr.vs.dys_down
hyp_dots(lmultihyp1, fdr=0.1, merge = TRUE, top = 50, title = "All Pairwise", abrv = 50) 

rctbl_mhyp(lmultihyp1)